1.1 Combine and clean the data
## Filter useful variables and remove duplicate events
athletes1 <- filter(athletes, Season != "Summer") %>%
select(NOC, Team, Sex, Year, Sport, Event, Medal) %>%
distinct()
## Group by countries and calculate medals and games competing in
Events <- athletes1 %>%
group_by(NOC, Medal) %>%
summarise(total = n())
## `summarise()` has grouped output by 'NOC'. You can override using the `.groups` argument.
Medal <- spread(
Events,
key = "Medal",
value = "total")
colnames(Medal)[5] <- "Participate"
Medal[is.na(Medal)] <- 0
Medal <- mutate(Medal, Total = Gold + Silver + Bronze + Participate,
TotalMedal = Gold + Silver + Bronze) %>%
arrange(desc(Gold))
## Every country with different code are calculated together in sports analysis
## Sum up the same countries in different code and append gdp in gdp analysis. Reserve only those who have names in gdppop data
Medal <- right_join(noc, Medal, by = "NOC")
country <- aggregate(cbind(Gold, Silver, Bronze, TotalMedal) ~ region, data = Medal, sum) %>%
arrange(desc(TotalMedal))
1.2 Over time comparation and Medals comparation
## Over Time table creation, the plot including 10 moat successful countries in Winter Olympics
Time <- athletes1 %>%
group_by(NOC, Year, Medal) %>%
summarise(total = n())
## `summarise()` has grouped output by 'NOC', 'Year'. You can override using the `.groups` argument.
Time <- right_join(noc, Time, by = "NOC")
Time <- spread(
Time,
key = "Medal",
value = "total")
Time <- Time[, -3]
Time[is.na(Time)] <- 0
Time <- mutate(Time,
Total = Gold + Silver + Bronze) %>%
arrange(desc(Year),desc(Gold))
top10 <- country[1:10, ]
top10overtime <- Time[Time$region %in% top10$region, ]
## Picture here with top10 countries
library(ggthemes)
top10hc <- hchart(top10overtime, "scatter",
hcaes(x = Year,
y = Total,
group = region))
top10hc
Sex <- athletes1 %>%
filter(Medal != "Na") %>%
group_by(NOC, Sex) %>%
summarise(total = n())
## `summarise()` has grouped output by 'NOC'. You can override using the `.groups` argument.
Sex <- spread(
Sex,
key = "Sex",
value = "total")
Medal <- left_join(Medal, Sex, by = "NOC")
Medal$F[is.na(Medal$F)] <- 0
Medal$M[is.na(Medal$M)] <- 0
Medal <- aggregate(cbind(Gold, Silver, Bronze, TotalMedal, Participate, Total, F, M) ~ region, data = Medal, sum) %>%
arrange(desc(TotalMedal))
Medalsex <- mutate(Medal, genderratio = F/M)
Medalsex <- Medalsex[1:10, ]
head(Medal, 10) %>% slice
## region Gold Silver Bronze TotalMedal Participate Total F M
## 1 Germany 139 138 113 390 988 1378 193 197
## 2 Russia 156 117 108 381 752 1133 176 205
## 3 Norway 112 108 100 320 627 947 59 261
## 4 USA 97 104 89 290 1024 1314 125 165
## 5 Austria 61 80 82 223 649 872 75 148
## 6 Canada 65 60 56 181 864 1045 88 93
## 7 Finland 42 63 57 162 470 632 42 120
## 8 Sweden 50 40 54 144 587 731 38 106
## 9 Switzerland 50 40 47 137 583 720 40 97
## 10 Italy 37 34 45 116 788 904 41 75
topF <- Medal %>%
select(region, F) %>%
arrange(desc(F))
head(topF, 10) %>% slice
## region F
## 1 Germany 193
## 2 Russia 176
## 3 USA 125
## 4 Canada 88
## 5 Austria 75
## 6 Norway 59
## 7 France 44
## 8 Finland 42
## 9 Italy 41
## 10 China 41
topM <- Medal %>%
select(region, M) %>%
arrange(desc(M))
head(topM, 10) %>% slice
## region M
## 1 Norway 261
## 2 Russia 205
## 3 Germany 197
## 4 USA 165
## 5 Austria 148
## 6 Finland 120
## 7 Sweden 106
## 8 Switzerland 97
## 9 Canada 93
## 10 Italy 75
## The aggregated medal rank with a gender ratio perspective.
## The ratio closer to 1, the more equal for female and male to gain a medal in this country.
Medalcount <- ggplot(Medalsex, aes(x = TotalMedal, y = reorder(region, TotalMedal), size = Total)) +
geom_point(aes(color = genderratio)) +
labs(x="TotalMedal", y="region")
Medalcount
## Another pie chart of medals gained by top10 countries
Medalcount2 <- ggplot(Medalsex,
aes(x = "", y = TotalMedal, fill = region)
) +
geom_bar(stat="identity", width=1, color="white") +
coord_polar("y", start=0) +
theme_void()
Medalcount2
gdp <- right_join(noc, Medal, by = "region")
gdp <- gdp[-which(gdp$notes =="Hong Kong"), ]
gdp <- inner_join(gdp, gdppop, by = c("NOC" = "Code"))
gdp <- mutate(gdp, medal_gdp = 10^3 * (Gold * 3 + Silver *2 + Bronze) / gdp$'GDP per Capita',
medal_pop = 10^6 * (Gold * 3 + Silver *2 + Bronze) / Population)
medal_gdp <- select(gdp, region, TotalMedal, medal_gdp) %>%
arrange(desc(medal_gdp))
slice(medal_gdp)
## # A tibble: 107 x 3
## region TotalMedal medal_gdp
## <chr> <dbl> <dbl>
## 1 Russia 381 89.1
## 2 Germany 390 19.5
## 3 China 58 13.5
## 4 USA 290 10.5
## 5 Austria 223 9.71
## 6 Norway 320 8.76
## 7 Canada 181 8.58
## 8 Italy 116 7.48
## 9 Finland 162 7.30
## 10 Ukraine 8 6.15
## # ... with 97 more rows
medal_pop <- select(gdp, region, TotalMedal, medal_pop) %>%
arrange(desc(medal_pop))
slice(medal_pop)
## # A tibble: 107 x 3
## region TotalMedal medal_pop
## <chr> <dbl> <dbl>
## 1 Liechtenstein 9 400.
## 2 Norway 320 125.
## 3 Finland 162 56.4
## 4 Austria 223 49.4
## 5 Switzerland 137 33.4
## 6 Sweden 144 29.0
## 7 Netherlands 110 13.1
## 8 Estonia 7 13.0
## 9 Slovenia 15 11.1
## 10 Canada 181 10.3
## # ... with 97 more rows
## Remove some tiny countries with <= 20 medals
medal_gdp <- medal_gdp %>%
filter(TotalMedal > 20)
medal_pop <- medal_pop %>%
filter(TotalMedal > 20)
gdpplot <- ggplot(medal_gdp, aes(x=reorder(region, medal_gdp), y = medal_gdp, fill = region, text = paste("Total:", TotalMedal))) +
geom_bar(stat="identity") +
coord_flip() +
xlab("region") +
ggtitle("Great Power Competition",
subtitle = "Calculate by GDP per capita * 10^3")
ggplotly(gdpplot)
popplot <- ggplot(medal_pop, aes(x=reorder(region, medal_pop), y = medal_pop, fill = region, text = paste("Total:", TotalMedal))) +
geom_bar(stat="identity") +
coord_flip() +
xlab("region") +
ggtitle("Advantage of King in the North",
subtitle = "Calculate by population * 10^6")
ggplotly(popplot)
3.Host Country Advantage
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[2]], fill=TRUE)[-1]
hosts <- hosts %>% filter(Winter != "") %>%
select(City, Country, Year)
## Modify over time df to gain meanmedals of every country
Time <- Time[, -7]
general <- Time %>%
select(region, Gold, Total) %>%
group_by(region) %>%
summarize(meanmedal = mean(Total), meangold = mean(Gold)) %>%
arrange(desc(meanmedal))
slice(general)
## # A tibble: 111 x 3
## region meanmedal meangold
## <chr> <dbl> <dbl>
## 1 Russia 23.8 9.75
## 2 Germany 15 5.35
## 3 Norway 14.5 5.09
## 4 USA 13.2 4.41
## 5 Austria 10.1 2.77
## 6 Canada 8.23 2.95
## 7 Finland 7.36 1.91
## 8 Sweden 6.55 2.27
## 9 Switzerland 6.23 2.27
## 10 Netherlands 5.5 1.85
## # ... with 101 more rows
##Modify host countries official names and append meanmedals
hosts$Country[hosts$Country == "United States"] <- "USA"
hosts$Country[hosts$Country == "Russia[h]"] <- "Russia"
hosts$Country[hosts$Country == "Yugoslavia"] <- "Serbia"
hostmedal <- merge(hosts, Time, by.x = c("Country", "Year"), by.y = c("region", "Year"))
hostmedal <- hostmedal %>%
select(Year, Country, Gold, Total)
hostmedal <- merge(hostmedal, general, by.x = "Country", by.y = "region")
hostmedal <- mutate(hostmedal, hostadvantage = (Total - meanmedal) * 0.7 + (Gold - meangold) * 0.3) %>%
arrange(desc(hostadvantage))
slice(hostmedal)
## Country Year Gold Total meanmedal meangold hostadvantage
## 1 Canada 2010 15 27 8.2272727 2.954545 16.7545455
## 2 USA 2002 10 34 13.1818182 4.409091 16.2500000
## 3 Russia 2014 15 38 23.8125000 9.750000 11.5062500
## 4 Norway 1994 8 23 14.5454545 5.090909 6.7909091
## 5 Japan 1998 5 10 2.2500000 0.500000 6.7750000
## 6 Italy 2006 5 11 5.2727273 1.681818 5.0045455
## 7 France 1992 3 10 5.2272727 1.545455 3.7772727
## 8 France 1968 4 9 5.2272727 1.545455 3.3772727
## 9 Switzerland 1948 3 9 6.2272727 2.272727 2.1590909
## 10 Austria 1964 4 12 10.1363636 2.772727 1.6727273
## 11 Norway 1952 7 16 14.5454545 5.090909 1.5909091
## 12 Japan 1972 1 3 2.2500000 0.500000 0.6750000
## 13 Serbia 1984 0 1 0.2105263 0.000000 0.5526316
## 14 USA 1932 6 13 13.1818182 4.409091 0.3500000
## 15 USA 1980 6 12 13.1818182 4.409091 -0.3500000
## 16 France 1924 0 4 5.2272727 1.545455 -1.3227273
## 17 Italy 1956 1 3 5.2727273 1.681818 -1.7954545
## 18 USA 1960 3 11 13.1818182 4.409091 -1.9500000
## 19 Canada 1988 0 6 8.2272727 2.954545 -2.4454545
## 20 Austria 1976 2 6 10.1363636 2.772727 -3.1272727
## 21 Switzerland 1928 0 1 6.2272727 2.272727 -4.3409091
## 22 Germany 1936 4 7 15.0000000 5.346154 -6.0038462
## Standard of hostadvantage = (Total - meanmedal) * 0.7 + (Gold - meangold) * 0.3
## Total = Medals gained during host Olympics this time
## Gold = Gold medals gained during host Olympics this time
## meanmedal = Medals gained during host Olympics in average
## meangold = Gold medals gained during host Olympics in average
## Based on hostadvantage to create a plotly plot
hostplot <- hostmedal %>%
plot_ly(x = ~Year, y = ~hostadvantage) %>%
add_lines() %>%
layout(title = "Hostadvantage Time Series")
hostplot <- add_markers(hostplot, size = ~Total, color = ~Country)
hostplot
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
athletes2 <- filter(athletes, Season != "Summer") %>%
group_by(Name, Medal) %>%
summarise(total = n())
## `summarise()` has grouped output by 'Name'. You can override using the `.groups` argument.
success <- spread(athletes2, key = "Medal", value = "total") %>%
arrange(desc(Gold))
success[is.na(success)] <- 0
success <- mutate(success, total = Bronze + Silver + Gold,
grade = Gold * 3 + Silver *2 + Bronze)%>%
arrange(desc(grade))
Body <- filter(athletes, Season != "Summer") %>%
select(Name, Sex, Height, Weight, Team) %>%
group_by(Name, Sex, Team) %>%
summarise(height = mean(Height),
weight = mean(Weight))
## `summarise()` has grouped output by 'Name', 'Sex'. You can override using the `.groups` argument.
Body <- distinct(Body, Name, .keep_all = TRUE)
success <- inner_join(success, Body, by = "Name")
slice(success)
## # A tibble: 18,925 x 11
## # Groups: Name [18,923]
## Name Bronze Gold Silver `<NA>` total grade Sex Team height weight
## <chr> <int> <int> <int> <int> <int> <dbl> <chr> <chr> <dbl> <dbl>
## 1 "Ole Einar~ 1 8 4 14 13 33 M Norway 178 65
## 2 "Marit Bjr~ 1 6 3 9 10 25 F Norway 168 64
## 3 "Lyubov Iv~ 0 6 3 3 9 24 F Russia 167 58
## 4 "Raisa Pet~ 1 4 5 5 10 23 F Sovie~ 162 53
## 5 "Claudia P~ 2 5 2 6 9 21 F Germa~ 166 61
## 6 "Viktor An" 2 6 0 2 8 20 M Russia 170 65
## 7 "Edy Sixte~ 2 4 3 3 9 20 M Sweden 177 72
## 8 "Yang Yang" 2 2 6 9 10 20 F China 166. 58.9
## 9 "Irene Kar~ 1 4 3 5 8 19 F Nethe~ 168 65
## 10 "Ricco Gro" 1 4 3 7 8 19 M Germa~ 179 75
## # ... with 18,915 more rows
## standard of success = Gold * 3 + Silver *2 + Bronze
bodyplot <- success %>%
filter(grade >= 10) %>%
plot_ly(x = ~weight, y = ~height, type = "scatter", mode = "markers", color = ~Sex, size = ~grade, text = ~paste('Name: ', Name))
bodyplot
## Warning: Ignoring 3 observations
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
6.datatable
library(DT)
## Medaldatatable has been ranged by Totalmedals while creating
## Countries who at least have one particiption has been calculated
## Of course, sports is not just medals, the participation times and medals gained grouped by gender are listed in the datatable.
Medal[is.na(Medal)] <- 0
Medaldatatable <- aggregate(cbind(Gold, Silver, Bronze, TotalMedal, Participate, Total, F, M) ~ region, data = Medal, sum) %>%
arrange(desc(TotalMedal))
datatable(Medaldatatable,
caption = "Table1: The aggregated medals every country has won over time",
filter = "top") %>%
formatStyle('region', color = 'white',
backgroundColor = 'red', fontWeight = 'bold')